# Libraries used for this analysis
!pip install folium # installs folium
import pandas as pd # imports pandas
import numpy as np # imports numpy
import matplotlib.pyplot as plt # to plot graphs
from matplotlib.pyplot import figure # to modify size of figures
import matplotlib.patches as mpatches # for building legend patches
import datetime as dt # for datetime objects
import folium # to create maps
from branca.colormap import LinearColormap # to create color maps
import seaborn as sns # imports seaborn
import warnings
warnings.filterwarnings("ignore")
# Donations from Alabama to California; parse contribution dates as datetimes.
a_c = pd.read_csv('alabama_to_california.csv', parse_dates=['contribution_receipt_date'])
# Take a 10% sample of each dataset since the total dataset is so large;
# the fixed random_state keeps the sample reproducible.
sample1 = a_c.sample(frac=0.10, random_state=99)
# Donations from Colorado to Kentucky.
c_k = pd.read_csv('colorado_to_kentucky.csv', parse_dates=['contribution_receipt_date'])
sample2 = c_k.sample(frac=0.10, random_state=99)
# Donations from Louisiana to New Hampshire.
l_n = pd.read_csv('louisiana_to_newhampshire.csv', parse_dates=['contribution_receipt_date'])
sample3 = l_n.sample(frac=0.10, random_state=99)
# Donations from New Jersey to South Dakota.
n_s = pd.read_csv('newjersey_to_southdakota.csv', parse_dates=['contribution_receipt_date'])
sample4 = n_s.sample(frac=0.10, random_state=99)
# Donations from Tennessee to Wyoming.
t_q = pd.read_csv('tennessee_to_wyoming.csv', parse_dates=['contribution_receipt_date'])
sample5 = t_q.sample(frac=0.10, random_state=99)
# Combine all of the sampled datasets into one.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# a single pd.concat replaces the chained appends and also avoids building
# the three intermediate frames (d1-d3) the original created.
data = pd.concat([sample1, sample2, sample3, sample4, sample5],
                 ignore_index=True)
# Final result: 10% of the total dataset of all donations made in the first
# 3 quarters from donors exceeding $200.
data.head()
# Latitude/longitude lookup for each state.
states_loc = pd.read_csv('statelatlong.csv')
# State population figures.
states_pop = pd.read_csv('State Populations.csv')
# The population file names its state column 'State' while the location file
# uses 'City'; rename so the two share a merge key.
states_pop = states_pop.rename(columns={'State': 'City'})
# Inner-join the two tables into the single states dataset used below.
states = states_loc.merge(states_pop, on='City', how='inner')
# Preview the merged states dataset.
states.head()
# Lookup table for converting ZIP codes to counties/FIPS codes.
counties = pd.read_csv('zip_county_fips_2018_03.csv')
# Cast the zip column to int.  FIX: use bracket indexing for the assignment;
# attribute-style assignment (counties.zip = ...) is fragile — pandas warns
# against it because it can silently set an instance attribute instead of a
# column when names collide.
counties['zip'] = counties['zip'].astype('int')
# Final counties dataset used for the analysis.
counties.head()
# SOC occupation-title crosswalk used to classify donor occupations.
occupations = pd.read_csv('https://raw.githubusercontent.com/dawaldron/data-visualizations/master/actblue_data_2020/titleocc_xw_final.csv')
# Keep only the columns this analysis needs.
occupations = occupations.drop(columns=['freq', 'source', 'onetcode', 'onettitle'])
# Preview the final occupations dataset.
occupations.head()
# Mapping from SOC codes to major occupational groupings.
occ_agg = pd.read_csv('https://raw.githubusercontent.com/dawaldron/data-visualizations/master/actblue_data_2020/occgrpnm.csv')
# Only the first two characters of the SOC number are needed to match
# subclasses of occupations to their major group.
occ_agg['occgrpcd'] = occ_agg['occgrpcd'].str.slice(0, 2)
# Preview the final occupational grouping dataset.
occ_agg.head()
# Keep only the columns needed for this analysis.  The trailing .copy()
# gives an independent frame so the column assignments below do not raise
# SettingWithCopyWarning on a view of `data`.
data_clean = data[['committee_id', 'report_type', 'contributor_name',
'contributor_state', 'contributor_zip', 'contributor_occupation', 'contribution_receipt_date',
'contribution_receipt_amount', 'contributor_aggregate_ytd']].copy()
# Coerce the dollar-amount columns to numeric dtypes.
data_clean['contribution_receipt_amount'] = pd.to_numeric(data_clean['contribution_receipt_amount'])
data_clean['contributor_aggregate_ytd'] = pd.to_numeric(data_clean['contributor_aggregate_ytd'])
# Keep only the 5-digit ZIP prefix (the original comment said 6, but the
# slice takes 5 — the standard US ZIP length).
data_clean['contributor_zip'] = data_clean['contributor_zip'].astype(str).str[:5]
# Drop rescinded donations (amount <= 0) and donations over the $2,800
# individual contribution limit.
# BUG FIX: the original combined the two conditions with `&` — an amount can
# never be both <= 0 AND > 2800, so no rows were ever dropped.  Keeping only
# the valid (0, 2800] range implements the stated intent.
data_clean = data_clean[(data_clean['contribution_receipt_amount'] > 0) &
                        (data_clean['contribution_receipt_amount'] <= 2800)]
# Map each FEC committee ID to its candidate's name.  A vectorized dict
# lookup replaces the original O(n) iterrows() loop.
committee_to_name = {
    'C00696948': 'Sanders',    # Sanders's Presidential Committee
    'C00580100': 'Trump',      # Trump's Presidential Committee
    'C00703975': 'Biden',      # Biden's Presidential Committee
    'C00693234': 'Warren',     # Warren's Presidential Committee
    'C00697441': 'Buttigieg',  # Buttigieg's Presidential Committee
    'C00696419': 'Klobuchar',  # Klobuchar's Presidential Committee
}
names = data_clean['committee_id'].map(committee_to_name)
# The original printed 'error' for unknown committee IDs (and the list then
# came up short, crashing the insert below); keep the diagnostic while the
# .map() leaves unknown IDs as NaN instead of misaligning the column.
if names.isna().any():
    print('error')
# Insert the new recipient-name column into the data_clean dataset.
data_clean.insert(0, "recipient_name", names, True)
# Drop committee_id now that we have the recipient name.
data_clean.drop(columns = 'committee_id', inplace = True)
# Add new column to represent party id, True: Rep, False: Dem.
data_clean['Party'] = data_clean['recipient_name'] == 'Trump'
# Final dataset used in the analysis.
data_clean.head()
# Function that creates a bar plot of the percentage of donations per value
# of `column` (e.g. per quarter or per candidate).
def bar_plot_func(data, column, p, color):
    """Draw a percentage-of-donations bar chart onto `p`.

    Parameters
    ----------
    data : pandas.DataFrame with a `column` column, one row per donation.
    column : str, the categorical column to count (quarter or candidate).
    p : either the matplotlib.pyplot module or an Axes to draw on.
    color : bar color(s), passed straight through to `p.bar`.

    Returns `p` so the caller can continue styling the plot.
    """
    import inspect  # stdlib; used to detect whether `p` is the pyplot module
    # Total number of donations in the given dataset.
    total_donations = data.shape[0]
    # FIX: only create/resize a figure when drawing via the pyplot module.
    # The original called figure() unconditionally, which opened a stray
    # blank 20x10 figure whenever the caller passed an existing Axes.
    if inspect.ismodule(p):
        figure(num=None, figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
    # Count donations per category, in sorted category order.  FIX: work with
    # the value_counts() Series directly — wrapping it in a DataFrame and
    # indexing by `column` relied on a column name that pandas >= 2.0 renamed
    # to 'count'.
    counts = data[column].value_counts().sort_index()
    # Percentage of the total donations in each category.
    pct = (counts.astype(float) / total_donations) * 100
    # Create the bar plot.
    p.bar(pct.index, pct, align='center', alpha=0.5,
          color= color)
    # Annotate each bar with its percentage.
    for i, v in enumerate(pct):
        p.text(i - .07, v + 1, "{0:.2f}".format(v) + "%", color='black', fontweight='bold')
    return p
# Produce the bar plot for percentage of donations per quarter.
quater_graph = bar_plot_func(data_clean, 'report_type', plt, ['tab:red', 'tab:green', 'tab:cyan'])
# Labels, title, grid and background color.
plt.xlabel('Quarter')
plt.ylabel('Percentage of Donations')
plt.title('Percentage of Donations per Quarter', fontsize=16, fontweight='bold')
ax = plt.gca()
ax.set_facecolor('ghostwhite')
# BUG FIX: plt.axes() creates a brand-new blank Axes on top of the plot
# (and that zero-argument use is deprecated); grid the Axes we already have.
ax.yaxis.grid()
plt.show()
# Function that scatter-plots the per-day percentage of donations over time,
# with a quadratic trend line.
def count_func(data, p, figure, color):
    """Scatter-plot the daily percentage of donations onto Axes `p`.

    Parameters
    ----------
    data : pandas.DataFrame with a datetime 'contribution_receipt_date' column.
    p : matplotlib Axes to draw on.
    figure : the caller's pre-sized Figure (kept for interface compatibility;
        referencing it here has no effect).
    color : color for the scatter points.

    Returns `p` so the caller can continue styling the plot.
    """
    # Total donations in the dataset.
    total_donations = data.shape[0]
    # No-op reference kept from the original interface.
    figure
    # Percentage of donations received on each date, in date order.
    # FIX: use the value_counts() Series directly — the original indexed the
    # wrapping DataFrame by 'contribution_receipt_date', a column name that
    # pandas >= 2.0 renamed to 'count'.
    daily = data['contribution_receipt_date'].value_counts().sort_index()
    # Dates as "days since 2019-01-01" (ordinal 737060), roughly 0..280.
    # FIX: computing on the Index avoids the deprecated positional x[0]
    # fallback on a date-indexed Series.
    x = daily.index.map(dt.datetime.toordinal) - 737060
    y = (daily / total_donations) * 100
    # Fit a quadratic trend line across the full date range.
    poly = np.polyfit(x, y, 2)
    f = np.poly1d(poly)
    x_new = np.linspace(x[0], x[-1], 50)
    y_new = f(x_new)
    # Scatter plot of the daily percentages.
    p.scatter(x, y, c = color)
    # Fitted trend line.
    p.plot(x_new, y_new, c = 'tab:orange')
    # Month-start day offsets used as tick positions, with month labels.
    index = [0, 31, 59, 90, 120, 151, 181, 212, 243, 273]
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct']
    # Labels, limits, background color, grid.
    p.set_xticks(index)
    p.set_xticklabels(months)
    p.set_xlabel('Time')
    p.set_ylabel('Percentage of Donations')
    p.set_xlim([-10, 280])
    p.set_ylim(bottom=0)
    p.set_facecolor('ghostwhite')
    p.grid()
    return p
# One large subplot for the overall donations-over-time scatter;
# figsize=(20, 10) sizes the figure up front.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
# Vertical markers at the end of each quarter (day offsets 90/181/273).
for day, line_color in ((90, 'tab:red'), (181, 'tab:green'), (273, 'tab:cyan')):
    plt.axvline(x=day, linestyle = '-', color = line_color)
# Draw the scatter plot and trend line onto the axes.
count_func(data_clean, ax, fig, 'tab:blue')
# Overall title.
ax.set_title('Percentage of Donations over Time', fontsize=16, fontweight='bold')
# Legend patches for the quarter-end markers.
Q1 = mpatches.Patch(color='tab:red', label='End of Q1')
Q2 = mpatches.Patch(color='tab:green', label='End of Q2')
Q3 = mpatches.Patch(color='tab:cyan', label='End of Q3')
plt.legend(handles=[Q1, Q2, Q3])
fig.show()
# Side-by-side bar charts: share of donations vs share of distinct donors
# per candidate; figsize=(20, 10) sizes the figure up front.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
# Subtitles for the two charts.
title = ['% of Donations for Each Candidate', '% of Total Donors for Each Candidate']
# One color per candidate.
color = ['black', 'tab:purple', 'tab:cyan', 'tab:blue', 'tab:red', 'tab:green']
# Shared styling: grid, labels, subtitle, background color.
for axis, subtitle in zip(ax, title):
    axis.grid(axis = 'y')
    axis.set_xlabel('Candidate')
    axis.set_ylabel('Percentage of Total Donations')
    axis.set_title(subtitle, fontsize=16, fontweight='bold')
    axis.set_facecolor('ghostwhite')
# Left chart: all donations.
bar_plot_func(data_clean, 'recipient_name', ax[0], color)
# Distinct donors: one row per (contributor, candidate) pair.
donors = data_clean.drop_duplicates(subset=['contributor_name', 'recipient_name'])
# Right chart: distinct donors.
bar_plot_func(donors, 'recipient_name', ax[1], color)
fig.show()
# 3x2 grid of subplots, one donations-over-time scatter per candidate;
# figsize=(20, 20) sizes the figure up front.
fig, ax = plt.subplots(3, 2, figsize=(20, 20))
# Scatter-point color for each candidate's plot.
color = ['black', 'tab:purple', 'orange', 'tab:blue', 'tab:red', 'tab:green']
# Candidate shown in each subplot, in row-major order.
name = ['Biden', 'Buttigieg', 'Klobuchar', 'Sanders', 'Trump', 'Warren']
# ax.flat walks the 3x2 grid row-major, matching the name/color lists.
for w, axis in enumerate(ax.flat):
    axis.set_title(name[w], fontsize=14, fontweight='bold')
    count_func(data_clean[data_clean['recipient_name'] == name[w]], axis, fig, color[w])
# Main title for the whole figure.
fig.suptitle('Percentage of Donations Over Time For Each Candidate', fontsize=16, fontweight='bold')
fig.show()
# Function that adds a per-candidate state-level choropleth layer to the
# given folium map.
def state_map_func(data, m, color, name):
    """Add a choropleth of `data`'s per-capita donation share per state to `m`.

    Parameters
    ----------
    data : pandas.DataFrame of one candidate's donations ('contributor_state').
    m : folium.Map to add the layer to.
    color : folium color-brewer palette name for the layer.
    name : candidate name, used for the layer and legend titles.

    Returns `m` so calls can be chained.
    """
    # Donation count per state.  FIX: build the frame explicitly from the
    # value_counts() Series — the original relied on the wrapped DataFrame's
    # column being named 'contributor_state', which pandas >= 2.0 renames
    # to 'count'.
    counts = data['contributor_state'].value_counts().sort_index()
    donations_state = pd.DataFrame({'donations_state': counts.index,
                                    'contributor_state': counts.values})
    # Attach each state's lat/long and population from the states dataset.
    donations_state = pd.merge(donations_state, states, left_on = 'donations_state', right_on = 'State', how='inner')
    # Donations per capita: the x10 rescales the 10% sample back to an
    # estimate of the full dataset before dividing by the 2018 population.
    donations_state['%'] = ((donations_state['contributor_state'].astype(int) * 10) /
                            donations_state['2018 Population']) * 100
    # Build the folium choropleth layer and attach it to the given map.
    folium.Choropleth(
        geo_data= 'us-states.json', # json file with each state's region
        name= name, # candidate name shown in the layer control
        data=donations_state, # dataset we are pulling from
        columns=['donations_state', '%'], # state and donations per capita
        key_on = 'feature.id', # the id used to match info from the json file
        fill_color= color, # the palette used for the choropleth
        fill_opacity=0.7, # filled color's opacity
        line_opacity=0.2, # line opacity
        legend_name='% of Donations for ' # legend title for this layer
        + name + ' Per Capita',
        overlay=False, # user cannot view two layers at once
        show=False # layer isn't shown when the map is opened
    ).add_to(m)
    return m
# Base folium map centred on the continental United States.
s_m = folium.Map(location=[37, -102], zoom_start=4)
# Folium color-brewer palette for each candidate, matching `name` order.
folium_colors = ['Greys', 'Purples', 'Oranges', 'Blues', 'Reds', 'Greens']
# Add one choropleth layer per candidate to the map.
for candidate, palette in zip(name, folium_colors):
    s_m = state_map_func(donors[donors['recipient_name'] == candidate], s_m, palette, candidate)
# Layer control lets the viewer switch between candidate maps.
folium.LayerControl().add_to(s_m)
s_m
# Function that produces a county-level choropleth layer for all candidates.
def county_map_func(best, m, map1):
    """Add a county-level choropleth of the leading candidate per county to `m`.

    Parameters
    ----------
    best : pandas.DataFrame with columns 'county', 'name', '%' — the leading
        candidate per county and their share (0-100) of that county's donations.
    m : folium.Map to add the layer to.
    map1 : bool — True for the Democratic-primary map (one color per Dem
        candidate), False for the Trump-vs-Democrats map (red/blue scale).

    Returns `m` so calls can be chained.
    """
    # Color used to represent each candidate.
    color = ['black', 'purple', 'orange', 'blue', 'red', 'green']
    # Hoist the shared value range out of the loop (it is loop-invariant).
    vmin, vmax = min(best['%']), max(best['%'])
    # One white->candidate-color scale per candidate, all on the same range.
    color_scale = [LinearColormap(['white', c], vmin = vmin, vmax = vmax)
                   for c in color]
    # Candidate -> colormap index for the Democratic-primary map.  Any name
    # not listed here (i.e. Trump) falls through to white on that map.
    dem_scale_idx = {'Biden': 0, 'Buttigieg': 1, 'Klobuchar': 2,
                     'Sanders': 3, 'Warren': 5}

    # Returns the fill color for one county feature from the GeoJSON.
    def get_color(feature):
        # Row(s) of `best` matching this county's name.
        value = best.loc[best['county'] == feature['properties']['NAME']]
        # BUG FIX (dead code): .loc never returns None, so the original
        # `if value is None` check could never fire; require exactly one
        # matching row instead (no match or duplicates -> white).
        if len(value) != 1:
            return '#ffffff'
        candidate = value['name'].item()
        share = value['%'].item()
        if map1:
            # Democratic-primary map: color by the leading Dem candidate.
            idx = dem_scale_idx.get(candidate)
            if idx is None:
                return '#ffffff'
            return color_scale[idx](share)
        # Second map: Trump share vs combined Democratic share.
        if candidate == 'Trump':
            if share > 50.00:
                return color_scale[4](share)        # Trump majority -> red scale
            return color_scale[3](100 - share)      # Dem majority -> blue scale
        return color_scale[3](100)                  # no Trump row: Dems at 100%

    # Build the GeoJSON layer and attach it to the given map.
    folium.GeoJson(
        data = 'cb_2015_us_county_5m.json', # json with county regions
        style_function = lambda feature: { # determines the color to add
            'fillColor': get_color(feature), # per-county fill color
            'fillOpacity': 0.7, # opacity of filled areas
            'color' : 'black', # color of county border lines
            'weight' : 0, # border lines not shown for counties
        },
        name = 'map', overlay=False).add_to(m)
    return m
# Function that converts donation data into the county-level form the map
# functions expect.
def convert_for_map(data):
    """Return a DataFrame of the leading candidate per county.

    Parameters
    ----------
    data : pandas.DataFrame with 'contributor_zip' and 'recipient_name'
        columns (one row per donation/donor).

    Returns a DataFrame with columns 'county', 'name', '%' — the candidate
    with the largest share of each county's donations and that share (0-100).
    Uses the module-level `counties` zip->county lookup.
    """
    # Donations per (zip, candidate).  FIX: groupby(...).size() is stable
    # across pandas versions — the original wrapped a grouped value_counts()
    # in a DataFrame and renamed a column whose name pandas >= 2.0 changed
    # to 'count'.
    donations_county = (data.groupby(['contributor_zip', 'recipient_name'])
                        .size().reset_index(name='freq'))
    # Zip as float to match the lookup table's key dtype for the merge.
    donations_county['contributor_zip'] = donations_county['contributor_zip'].astype('float')
    # Attach county names via the zip->county lookup.
    donations_county = pd.merge(counties, donations_county, left_on = 'zip', right_on = 'contributor_zip',
                                how='inner')
    # Total donations per county.
    sums = donations_county.groupby('countyname', as_index=False)['freq'].sum()
    sums.rename(columns= {'freq':'total_county', 'countyname':'county'}, inplace = True)
    # Donations per county per candidate.  FIX: reset_index() turns the
    # (county, candidate) group keys into columns directly, replacing the
    # original O(n) iterrows() reconstruction of the MultiIndex.
    sums_candidates = (donations_county.groupby(['countyname', 'recipient_name'], as_index=False)['freq']
                       .sum()
                       .rename(columns={'countyname': 'county', 'recipient_name': 'name'}))
    # Combine per-candidate counts with county totals.
    sums_both = pd.merge(sums_candidates, sums, on='county', how='inner')
    # Each candidate's share of the county's donations.
    sums_both['%'] = sums_both['freq'] / sums_both['total_county']
    # Rows holding the maximum share in each county (the leading candidate).
    # FIX: transform('max') — passing the builtin max is deprecated.
    idx = sums_both.groupby('county')['%'].transform('max') == sums_both['%']
    # FIX: .copy() so the edits below don't raise SettingWithCopyWarning on
    # a view of sums_both.
    best = sums_both[idx].copy()
    # Strip the ' County' suffix so names match the GeoJSON county names.
    best['county'] = best['county'].map(lambda x: x.replace(' County', ''))
    # Express the share as a percentage.
    best['%'] = best['%'] * 100
    # Keep only the columns the map functions need.
    map_dict = best[['county', 'name', '%']].copy()
    return map_dict
# Index labels of every Trump donation, so the first county map covers only
# the Democratic candidates.
ind = donors.loc[donors['recipient_name'] == 'Trump'].index
# Everything except Trump's rows.
donors_minus_trump = donors.drop(index=ind)
# Reshape into the county/name/% form the county map function expects.
donors_minus_trump_map = convert_for_map(donors_minus_trump)
# Preview the dataset used for the first county-level map.
donors_minus_trump_map.head()
# Fresh folium map of the United States.
c_m1 = folium.Map(location=[37, -102], zoom_start=4)
# County-level choropleth of the leading Democratic candidate per county.
map1 = county_map_func(donors_minus_trump_map, c_m1, True)
map1